In [1]:
# preambles
import networkx as nx
import cPickle as pickle
import os
# load pickled network file
pickle_name = 'Data_world8_network.pckl'
pickle_dir = 'C:\\Users\\FG\\Desktop\\PhD\\Research\\reddit\\Pickled Data'
with open(pickle_dir + os.sep + pickle_name, "rb") as pickle_file:
    reddit_network = pickle.load(pickle_file)
print "loaded reddit network from ", pickle_dir + os.sep + pickle_name
In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# create graph
reddit_graph = nx.Graph(name='Reddit Graph')
# add edges (nodes added automatically)
for userA in reddit_network.keys():
    for userB in reddit_network[userA].keys():
        # add only if enough messages between the two users
        if len(reddit_network[userA][userB]) >= 2 and (userA != userB):
            reddit_graph.add_edge(userA, userB)
            # print reddit_graph[userA][userB]['polarity']
# keep only the largest connected component
reddit_graph = sorted(nx.connected_component_subgraphs(reddit_graph), key=len, reverse=True)[0]
In [4]:
# Stats
# print 'degrees:',nx.degree_histogram(reddit_graph)
print 'info: ', nx.info(reddit_graph)
print 'density: ', nx.density(reddit_graph), ' (0-1 scale, 0 for empty graph, 1 for complete graph)'
# triangles
triangles_dict = nx.triangles(reddit_graph)
nodes_in_triangles = [node for node in reddit_graph.nodes() if triangles_dict[node] >= 1]
print '# of nodes in at least one triangle:', len(nodes_in_triangles)
print '# of triangles in graph:', sum(triangles_dict.values())/3
# plot the degree distribution
print "Degree Distribution"
hist = nx.degree_histogram(reddit_graph)
plt.bar(range(len(hist)), hist, align='center')
plt.xlim(0,30)
plt.show()
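As an optional follow-up (not part of the original run), the same histogram can be viewed on log-log axes, which usually makes a heavy-tailed degree distribution easier to judge; this sketch simply reuses the `hist` computed above.
In [ ]:
# optional sketch: degree distribution on log-log axes, reusing `hist` from above
degrees = [d for d in range(len(hist)) if d > 0 and hist[d] > 0]
plt.loglog(degrees, [hist[d] for d in degrees], 'o')
plt.xlabel('degree')
plt.ylabel('count')
plt.show()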
In [5]:
# extract list of triangles (each triangle listed exactly once)
triangle_list = []
done = set()
for n in reddit_graph:
    done.add(n)                  # mark n so later passes skip triangles already found here
    nbrdone = set()              # neighbours of n already used as the second node
    nbrs = set(reddit_graph[n])
    for nbr in nbrs:
        if nbr in done:          # already handled as a first node in an earlier pass
            continue
        nbrdone.add(nbr)
        for both in nbrs.intersection(reddit_graph[nbr]):
            if both in done or both in nbrdone:  # avoid listing the same triangle twice
                continue
            triangle_list.append((n, nbr, both))
print len(triangle_list)
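A quick consistency check, added here as a sketch: every triangle should be listed exactly once, so the length of `triangle_list` should match the triangle count reported by `nx.triangles` in the stats cell.
In [ ]:
# sanity check: each triangle should appear exactly once in triangle_list
print len(triangle_list) == sum(nx.triangles(reddit_graph).values())/3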
In [6]:
# reconstruct graph from triangles (with only edges in those triangles)
from itertools import combinations
triangles_reddit_graph = nx.Graph(name='Triangles Reddit Graph')
for triangle in triangle_list:
    for n1, n2 in combinations(triangle, 2):
        # add edge (nodes added automatically)
        triangles_reddit_graph.add_edge(n1, n2)
In [7]:
# Stats
# print 'degrees:',nx.degree_histogram(reddit_graph)
print 'info: ', nx.info(triangles_reddit_graph)
print 'density: ', nx.density(triangles_reddit_graph), ' (0-1 scale, 0 for empty graph, 1 for complete graph)'
# triangles
triangles_dict = nx.triangles(triangles_reddit_graph)
nodes_in_triangles = [node for node in triangles_reddit_graph.nodes() if triangles_dict[node] >= 1]
print '# of nodes in at least one triangle:', len(nodes_in_triangles)
print '# of triangles in graph:', sum(triangles_dict.values())/3
# plot the degree distribution
print "Degree Distribution"
hist = nx.degree_histogram(triangles_reddit_graph)
plt.bar(range(len(hist)), hist, align='center')
plt.xlim(0,30)
plt.show()
In [8]:
# distribution of triangle types in a random network
# (right half of Table 4 in the paper)
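Since each edge sign below is drawn independently and uniformly from {-1, +1}, the expected fractions of +++, ++-, +--, --- triangles are 1/8, 3/8, 3/8 and 1/8. The sketch below (not part of the original run) turns these into expected counts for comparison with the observed randomized counts.
In [ ]:
# expected triangle-type counts under independent, uniform random signs (sketch)
n_triangles = float(len(triangle_list))
expected = {'+++': n_triangles/8, '++-': 3*n_triangles/8,
            '+--': 3*n_triangles/8, '---': n_triangles/8}
print "expected counts under random signs: ", expected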
In [9]:
import random
# randomize edge signs: -1 for negative, +1 for positive
for edge in triangles_reddit_graph.edges():
    triangles_reddit_graph.edge[edge[0]][edge[1]]['polarity'] = 2*random.randint(0, 1) - 1
In [10]:
# analyse triangle types
from itertools import combinations
# counts of +++, ++-, +--, --- triangles respectively
counts = {'+++':0, '++-':0, '+--':0, '---':0}
for triangle in triangle_list:
    pairs = combinations(triangle, 2)
    tot = 0
    for pair in pairs:
        tot += triangles_reddit_graph.edge[pair[0]][pair[1]]['polarity']
    # with all polarities in {-1, +1}, the sum identifies the triangle type
    if tot == 3:
        counts['+++'] += 1
    elif tot == 1:
        counts['++-'] += 1
    elif tot == -1:
        counts['+--'] += 1
    elif tot == -3:
        counts['---'] += 1
    else:
        print "ERROR"
print "counts in a randomized network: ", counts
In [11]:
# apply OpinionFinder to each edge
import string
for edge in triangles_reddit_graph.edges():
    # extract text data
    userA, userB = edge[0], edge[1]
    # print userA, userB
    text_list = reddit_network[userA][userB]
    # print text_list
    # save text to temp file
    with open("tmp.txt", "w") as text_file:
        for msg in text_list:
            # remove non-ascii characters for OpinionFinder
            msg_filtered = filter(lambda x: x in string.printable, msg)
            # print msg_filtered
            text_file.write(msg_filtered + '\n')
    # analyse polarity with OpinionFinder (raw string to protect the Windows-style paths)
    os.system(r'java -classpath opinionfinderv2.0\lib\weka.jar;opinionfinderv2.0\lib\stanford-postagger.jar;opinionfinderv2.0\opinionfinder.jar opin.main.RunOpinionFinder tmp.txt -m opinionfinderv2.0\models -l opinionfinderv2.0\lexicons')
    # open polarity results
    with open(r"tmp.txt_auto_anns\markup.txt", "r") as res_file:
        content = res_file.read()
    positives = content.count("positive")
    negatives = content.count("negative")
    neutrals = content.count("neutral")
    print userA, userB, positives, negatives, neutrals
    # assign polarity: -1 mostly negative, +1 mostly positive, 0 tie
    if negatives > positives:
        polarity_val = -1
    elif negatives < positives:
        polarity_val = 1
    else:
        polarity_val = 0
    triangles_reddit_graph.edge[userA][userB]['polarity'] = polarity_val
print "DONE"
In [12]:
# analyse triangle types
# counts of +++, ++-, +--, --- triangles respectively
counts = {'+++':0, '++-':0, '+--':0, '---':0}
for triangle in triangle_list:
    polarities = [triangles_reddit_graph.edge[n1][n2]['polarity']
                  for n1, n2 in combinations(triangle, 2)]
    # skip triangles containing a neutral (0) edge: their polarity sum is
    # ambiguous (e.g. +,0,0 also sums to 1), so they are not classified
    if 0 in polarities:
        continue
    tot = sum(polarities)
    if tot == 3:
        counts['+++'] += 1
    elif tot == 1:
        counts['++-'] += 1
    elif tot == -1:
        counts['+--'] += 1
    elif tot == -3:
        counts['---'] += 1
print "counts in a labeled network: ", counts
In [38]:
# Subgroup detection approximation algorithm
# parameters from the paper
alpha = 0.5
max_cluster_num = 5
# network to work on
work_graph = triangles_reddit_graph.copy()
best_cluster_graph = work_graph.copy()
best_price = float("inf")
# for each cluster number >= 2
for t in range(2, max_cluster_num+1):
    # partition network into t clusters randomly
    # (assign a random cluster value to each node)
    for node in work_graph.nodes():
        work_graph.node[node]['cluster'] = random.randint(1, t)
    # compute price of partition
    cur_price = 0
    for edge in work_graph.edges():
        userA, userB, pol = edge[0], edge[1], work_graph.edge[edge[0]][edge[1]]['polarity']
        if pol == -1 and (work_graph.node[userA]['cluster'] == work_graph.node[userB]['cluster']):
            # same group, disagreement
            cur_price += (1-alpha)
        elif pol == 1 and (work_graph.node[userA]['cluster'] != work_graph.node[userB]['cluster']):
            # different groups, agreement
            cur_price += alpha
        else:
            # do not increase price
            pass
    print "clusters: ", t, "initial price", cur_price
    if cur_price < best_price:
        best_cluster_graph = work_graph.copy()
        best_price = cur_price
    # look at all neighbor partitions
    copy_graph = work_graph.copy()
    best_switch = None
    # while a neighbor is better
    neighbor_better = True
    while neighbor_better == True:
        # set loop break
        neighbor_better = False
        # -1 try each node as switch
        for switch_node in copy_graph.nodes():
            # save cluster value
            save_val = copy_graph.node[switch_node]['cluster']
            # try each other cluster value in 1..t
            for cluster_val in [x for x in range(1, t+1) if x != save_val]:
                # try new value
                copy_graph.node[switch_node]['cluster'] = cluster_val
                # compute new price
                neigh_price = 0
                for edge in copy_graph.edges():
                    userA, userB, pol = edge[0], edge[1], copy_graph.edge[edge[0]][edge[1]]['polarity']
                    if pol == -1 and (copy_graph.node[userA]['cluster'] == copy_graph.node[userB]['cluster']):
                        # same group, disagreement
                        neigh_price += (1-alpha)
                    elif pol == 1 and (copy_graph.node[userA]['cluster'] != copy_graph.node[userB]['cluster']):
                        # different groups, agreement
                        neigh_price += alpha
                    else:
                        # do not increase price
                        pass
                # remember move if better
                if neigh_price < cur_price:
                    cur_price = neigh_price
                    best_switch = [switch_node, cluster_val]
                    neighbor_better = True
                    if cur_price < best_price:
                        best_cluster_graph = copy_graph.copy()
                        best_price = cur_price
            # reset value
            copy_graph.node[switch_node]['cluster'] = save_val
        # -2 try each edge as switch (swap the clusters of its two endpoints)
        for switch_edge in copy_graph.edges():
            # switch values
            copy_graph.node[switch_edge[0]]['cluster'], copy_graph.node[switch_edge[1]]['cluster'] = copy_graph.node[switch_edge[1]]['cluster'], copy_graph.node[switch_edge[0]]['cluster']
            # compute new price
            neigh_price = 0
            for edge in copy_graph.edges():
                userA, userB, pol = edge[0], edge[1], copy_graph.edge[edge[0]][edge[1]]['polarity']
                if pol == -1 and (copy_graph.node[userA]['cluster'] == copy_graph.node[userB]['cluster']):
                    # same group, disagreement
                    neigh_price += (1-alpha)
                elif pol == 1 and (copy_graph.node[userA]['cluster'] != copy_graph.node[userB]['cluster']):
                    # different groups, agreement
                    neigh_price += alpha
                else:
                    # do not increase price
                    pass
            # remember move if better
            if neigh_price < cur_price:
                cur_price = neigh_price
                best_switch = switch_edge
                neighbor_better = True
                if cur_price < best_price:
                    best_cluster_graph = copy_graph.copy()
                    best_price = cur_price
            # reset values
            copy_graph.node[switch_edge[0]]['cluster'], copy_graph.node[switch_edge[1]]['cluster'] = copy_graph.node[switch_edge[1]]['cluster'], copy_graph.node[switch_edge[0]]['cluster']
        # apply the best switch found in this pass (if any)
        if neighbor_better:
            if type(best_switch) == tuple:
                # best switch is an edge swap
                copy_graph.node[best_switch[0]]['cluster'], copy_graph.node[best_switch[1]]['cluster'] = copy_graph.node[best_switch[1]]['cluster'], copy_graph.node[best_switch[0]]['cluster']
            elif type(best_switch) == list:
                # best switch is a node move
                copy_graph.node[best_switch[0]]['cluster'] = best_switch[1]
        print neighbor_better
    print "best neighbor", cur_price
In [44]:
print best_price
for node in best_cluster_graph.nodes():
    print node, best_cluster_graph.node[node]['cluster']
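A compact view of the resulting partition (sketch, not part of the original run): count how many nodes fall into each cluster.
In [ ]:
# cluster sizes in the best partition found (sketch)
from collections import Counter
cluster_sizes = Counter(best_cluster_graph.node[n]['cluster'] for n in best_cluster_graph.nodes())
print cluster_sizes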
In [35]:
# quick type check: edges come back as tuples, which is what the
# type-based dispatch on best_switch in the algorithm above relies on
for edge in triangles_reddit_graph.edges():
    print type(edge)
    break
for node in triangles_reddit_graph.nodes():
    print type(node)
    break
print type(('test', 1)) == tuple
In [14]:
# !cd opinionfinderv2.0\
# test run: OpinionFinder applied to its own README file
!java -classpath opinionfinderv2.0\lib\weka.jar;opinionfinderv2.0\lib\stanford-postagger.jar;opinionfinderv2.0\opinionfinder.jar opin.main.RunOpinionFinder opinionfinderv2.0\README.txt -m opinionfinderv2.0\models -l opinionfinderv2.0\lexicons